#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

#install.packages("factoextra")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)

DF1 - EMPLOYMENT

#Abhay
#Employment - K202301 for 2021

# Get ACS data
df1 <- get_acs(geography = "state", 
               table = "K202301", 
               year = 2021, 
               survey = "acs1-year", 
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202301 and caching the dataset for faster future access.
variables_df1 <- data.frame(
  variable = c("K202301_001", "K202301_002", "K202301_003", "K202301_004", "K202301_005", "K202301_006", "K202301_007"),
  label = c("Total", "In labor force:", "Civilian labor force:", "Employed", 
            "Unemployed", "In Armed Forces", 
            "Not in labor force")
)



# Joining the data with the variable labels to get the descriptive names
df1_labeled <- df1 %>%
  left_join(variables_df1, by = "variable")

# Reshapeing the data so that state names are rows and variable labels are columns
df1_wide <- df1_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# Calculate employment rates
df1_wide <- df1_wide %>%
  mutate(
    EmploymentRate = `Employed` / `Total` * 100,
    UnemploymentRate = `Unemployed` / `Total` * 100,
    NotInLaborForceRate = `Not in labor force` / `Total` * 100
  )

# Plotting employment rates
library(ggplot2)

ggplot(df1_wide, aes(x = reorder(NAME, -EmploymentRate), y = EmploymentRate)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(x = "State", y = "Employment Rate (%)", title = "Employment Rates by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Plotting labor force participation
ggplot(df1_wide, aes(x = NAME)) +
  geom_bar(aes(y = `Employed`, fill = "Employed"), stat = "identity") +
  geom_bar(aes(y = `Unemployed`, fill = "Unemployed"), stat = "identity") +
  geom_bar(aes(y = `In Armed Forces`, fill = "Armed Forces"), stat = "identity") +
  scale_fill_manual(values = c("Employed" = "green", "Unemployed" = "red", "Armed Forces" = "blue")) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "State", y = "Number of People", fill = "Category", title = "Labor Force Participation by State")

ggplot(df1_wide, aes(x = UnemploymentRate)) +
  geom_histogram(binwidth = 1, fill = "tomato", color = "black") +
  labs(x = "Unemployment Rate (%)", y = "Number of States", title = "Distribution of Unemployment Rates across States")

# Correlation analysis
employment_data <- df1_wide %>%
  select(`Employed`, `Unemployed`, `In Armed Forces`, `Not in labor force`)

correlation_matrix <- cor(employment_data, use = "complete.obs")

# Visualize the correlation matrix
library(corrplot)
## corrplot 0.92 loaded
corrplot(correlation_matrix, method = "circle")

# Convert state names to abbreviations
state_abbreviations <- state.abb[match(df1_wide$NAME, state.name)]
df1_wide$state <- state_abbreviations

df1_wide$EmploymentRate <- df1_wide$`Employed` / df1_wide$Total * 100

# Plot the map with employment rate
plot_usmap(data = df1_wide, values = "EmploymentRate", labels = TRUE) +
  scale_fill_continuous(name = "Employment Rate (%)", label = scales::percent_format()) +
  theme(legend.position = "right") +
  labs(title = "Employment Rate by State in 2021")



DF 2 - Education

#Neha
#Education - K201501
df2 <- get_acs(geography = "state", 
                table = "K201501", 
                year = 2021, 
                survey = "acs1/subject",cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201501 and caching the dataset for faster future access.
variables_df2 <- data.frame(
  variable = c("K201501_001", "K201501_002", "K201501_003", "K201501_004", "K201501_005", "K201501_006", "K201501_007", "K201501_008"),
  label = c("Education_Total_students", "Education_Below_9th grade", "Education_9th to 12th grade_no diploma", "Education_High_school_graduate", "Education_Some college_no degree", "Education_Associates_degree", "Education_Bachelors_degree", "Education_Graduate_professional degree")
)

df2_labeled <- df2 %>%
  left_join(variables_df2, by = "variable")

df2_wide <- df2_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)

Exploratory Data Analysis (EDA)

# Ensure necessary libraries are installed and loaded
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidyr")) install.packages("tidyr")
library(ggplot2)
library(tidyr)

# Convert df2_wide to long format
df2_long <- df2_wide %>%
  gather(key = "Education_Level", value = "Count", -NAME)

# Plotting the distribution of each education level across states
ggplot(df2_long, aes(x = NAME, y = Count, fill = Education_Level)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Distribution of Education Levels across States", 
       x = "State", 
       y = "Count")

Exploratory Data Analysis (EDA) based on percentage

# Ensure necessary libraries are installed and loaded
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidyr")) install.packages("tidyr")
if (!require("dplyr")) install.packages("dplyr")
library(ggplot2)
library(tidyr)
library(dplyr)

# Convert df2_wide to long format
df2_long <- df2_wide %>%
  gather(key = "Education_Level", value = "Count", -NAME)

# Calculate total count for each state
df2_long <- df2_long %>%
  group_by(NAME) %>%
  mutate(Total = sum(Count)) %>%
  ungroup()

# Calculate the percentage
df2_long <- df2_long %>%
  mutate(Percentage = (Count / Total) * 100)

# Plotting the distribution of each education level across states as a percentage
ggplot(df2_long, aes(x = NAME, y = Percentage, fill = Education_Level)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Percentage Distribution of Education Levels across States", 
       x = "State", 
       y = "Percentage")

library(ggplot2)
library(tidyr)
library(dplyr)

# Convert df2_wide to long format
df2_long <- df2_wide %>%
  gather(key = "Education_Level", value = "Count", -NAME)

# Calculate total count for each state only once
total_counts <- df2_long %>%
  group_by(NAME) %>%
  summarize(Total = sum(Count)) 

# Join with original data to add the Total column
df2_long <- df2_long %>%
  left_join(total_counts, by = "NAME")

# Calculate the percentage for each education level
df2_long <- df2_long %>%
  mutate(Percentage = (Count / Total) * 100)

# Create graphs for each educational level
unique_levels <- unique(df2_long$Education_Level)
plots <- list()

for (level in unique_levels) {
  df_subset <- filter(df2_long, Education_Level == level)

  p <- ggplot(df_subset, aes(x = NAME, y = Percentage, fill = Education_Level)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = paste("Percentage Distribution of", level, "across States"), 
         x = "State", 
         y = "Percentage") +
    guides(fill = guide_legend(title = "Education Level"))

  plots[[level]] <- p
}

# Print plots (optional)
for (plot in plots) {
  print(plot)
}

df2_percentages <- df2_wide %>%
  mutate(
    `Education_Below_9th grade_Percentage` = `Education_Below_9th grade` / Education_Total_students * 100,
    `Education_9th to 12th grade_no diploma_Percentage` = `Education_9th to 12th grade_no diploma` / Education_Total_students * 100,
    Education_High_school_graduate_Percentage = Education_High_school_graduate / Education_Total_students * 100,
    `Education_Some college_no degree_Percentage` = `Education_Some college_no degree` / Education_Total_students * 100,
    Education_Associates_degree_Percentage = Education_Associates_degree / Education_Total_students * 100,
    Education_Bachelors_degree_Percentage = Education_Bachelors_degree / Education_Total_students * 100,
    `Education_Graduate_professional degree_Percentage` = `Education_Graduate_professional degree` / Education_Total_students * 100
  )
percentage_columns <- grep("_Percentage$", names(df2_percentages), value = TRUE)
df2_long <- df2_percentages %>%
  pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(percentage_columns)
## 
##   # Now:
##   data %>% select(all_of(percentage_columns))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(df2_long, aes(x = reorder(NAME, -Education_Total_students), y = Percentage, fill = Age_Group)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Age Distribution by State",
       x = "State",
       y = "Percentage of Total Population",
       fill = "Age Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_d()  # Adjust the color scale as needed



DF3 - Citizenship

#Esha
#Citizenship - K200501

df3<- get_acs(
  geography = "state", 
  table = "K200501", 
  year = 2021, 
  survey = "acs1/subject", 
  cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200501 and caching the dataset for faster future access.
variables_df3 <- data.frame(
  variable = c("K200501_001", "K200501_002","K200501_003"),  
  label = c("Total","U.S. citizen", "Not a U.S. citizen")  
)

df3_labeled <- df3 %>%
  left_join(variables_df3, by = "variable")

df3_wide <- df3_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)

# Load necessary libraries
library(ggplot2)
library(tidyr)
library(dplyr)

# Convert df3_wide to long format
df3_long <- df3_wide %>%
  gather(key = "Citizenship", value = "Count", -NAME)

# Calculate total count for each state
df3_long <- df3_long %>%
  group_by(NAME) %>%
  mutate(Total = sum(Count)) %>%
  ungroup()

# Calculate the percentage
df3_long <- df3_long %>%
  mutate(Percentage = (Count / Total) * 100)

# Iterate over each unique citizenship category
for (category in unique(df3_long$Citizenship)) {
  # Filter data for the current citizenship category
  df3_subset <- df3_long %>% 
    filter(Citizenship == category)

  # Generate and print the plot for citizenship
  p <- ggplot(df3_subset, aes(x = NAME, y = Percentage, fill = Citizenship)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = paste("Percentage Distribution of", category, "across States"), 
         x = "State", 
         y = "Percentage")

  print(p)
}
## Warning: Removed 1 rows containing missing values (`position_stack()`).

## Warning: Removed 1 rows containing missing values (`position_stack()`).

## Warning: Removed 1 rows containing missing values (`position_stack()`).



DF4 - Age

#Srika
#Age - K200104
df4 <- get_acs(geography = "state", 
                table = "K200104", 
                year = 2021,
                survey = "acs1/subject",cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
variables_df4 <- data.frame(
  variable = c("K200104_001", "K200104_002", "K200104_003", "K200104_004", "K200104_005", "K200104_006", "K200104_007", "K200104_008"),

  label = c("Total_age", "Age_under_18", 
            "Age_18_to_24", "Age_25_to_34", 
            "Age_35_to_44", "Age_45_to_54", "Age_55_to_64","Age_over_64")
)

df4_labeled <- df4 %>%
  left_join(variables_df4, by = "variable")

df4_wide <- df4_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
df4_percentages <- df4_wide %>%
  mutate(
    Age_under_18_Percentage = Age_under_18 / Total_age * 100,
    Age_18_to_24_Percentage = Age_18_to_24 / Total_age * 100,
    Age_25_to_34_Percentage = Age_25_to_34 / Total_age * 100,
    Age_35_to_44_Percentage = Age_35_to_44 / Total_age * 100,
    Age_45_to_54_Percentage = Age_45_to_54 / Total_age * 100,
    Age_55_to_64_Percentage = Age_55_to_64 / Total_age * 100,
    Age_over_64_Percentage = Age_over_64 / Total_age * 100
  )
percentage_columns <- grep("_Percentage$", names(df4_percentages), value = TRUE)
df4_long <- df4_percentages %>%
  pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")
ggplot(df4_long, aes(x = reorder(NAME, -Total_age), y = Percentage, fill = Age_Group)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Age Distribution by State",
       x = "State",
       y = "Percentage of Total Population",
       fill = "Age Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_d()  # Adjust the color scale as needed

# Load necessary libraries if not already loaded
library(ggplot2)
library(tidyr)

# Calculate percentages for each age group
df4_percentages <- df4_wide %>%
  mutate(
    Age_under_18_Percentage = Age_under_18 / Total_age * 100,
    Age_18_to_24_Percentage = Age_18_to_24 / Total_age * 100,
    Age_25_to_34_Percentage = Age_25_to_34 / Total_age * 100,
    Age_35_to_44_Percentage = Age_35_to_44 / Total_age * 100,
    Age_45_to_54_Percentage = Age_45_to_54 / Total_age * 100,
    Age_55_to_64_Percentage = Age_55_to_64 / Total_age * 100,
    Age_over_64_Percentage = Age_over_64 / Total_age * 100
  )

# Filter columns based on names ending with "Percentage"
percentage_columns <- grep("_Percentage$", names(df4_percentages), value = TRUE)

# Reshape the data to a longer format
df4_long <- df4_percentages %>%
  pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")

# Create a facet grid for each age group
ggplot(df4_long, aes(x = reorder(NAME, -Total_age), y = Percentage, fill = Age_Group)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_grid(Age_Group ~ ., scales = "free_y") +
  labs(title = "Age Distribution by State",
       x = "State",
       y = "Percentage of Total Population",
       fill = "Age Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_d()  # Adjust the color scale as needed

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::map()    masks maps::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
Age_data <- get_acs(geography = "state", 
                table = "K200104", 
                year = 2021, 
               geometry = TRUE,
               resolution = "20m",
                survey = "acs1/subject",cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |======================================================================| 100%
Mapping_age<-Age_data%>%
  filter(variable=="K200104_007")%>%
  shift_geometry()
ggplot(data = Mapping_age, aes(fill = estimate)) + 
  geom_sf()+
  scale_fill_distiller(palette = "RdPu", 
                       direction = 1) + 
  labs(title = "Median Age by State, 2021",
       caption = "Data source: 2021 1-year ACS, US Census Bureau",
       fill = "ACS estimate") + 
  theme_void()



DF5 - Housing

#Niharika
# Housing - K202502
df5 <- get_acs(geography = "state", 
                table = "K202502", 
                year = 2021, 
                survey = "acs1", # removed /subject since we're referring to a specific table ID
                cache_table = TRUE)
## Getting data from the 2021 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202502 and caching the dataset for faster future access.
# Define the variable codes and their respective labels for the housing dataset (df5)
variables_df5 <- data.frame(
  variable = c("K202502_001", "K202502_002", "K202502_003"),
  label = c("Total", "Owner Occupied", "Renter Occupied")
)

# Assuming df5 is your housing dataset
# Reshape the data so that state names are rows
df5_labeled <- df5 %>%
  left_join(variables_df5, by = "variable")  # Joining with the variable labels dataframe

# Reshape the data to have state names as rows and variable labels as columns
df5_wide <- df5_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)

# df5_wide will have states as rows and different housing-related variables as columns
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Assuming df5_wide is your dataset for housing
# Melt the data to long format for easy plotting
df5_long <- tidyr::gather(df5_wide, key = "Housing_Category", value = "Count", -NAME)

# Calculate the percentage for Owner and Renter Occupied categories
df5_long <- df5_long %>%
  group_by(NAME) %>%
  mutate(Total = sum(Count[which(Housing_Category == "Total")])) %>%
  ungroup() %>%
  mutate(Percentage = ifelse(Housing_Category != "Total", Count / Total * 100, Count))

# Define colors for each category
category_colors <- c("Owner Occupied" = "steelblue", "Renter Occupied" = "salmon", "Total" = "lightgreen")

# Function to create a bar plot
create_bar_plot <- function(data, category, title) {
  ggplot(data[data$Housing_Category == category, ], aes(x = reorder(NAME, -Percentage), y = Percentage, fill = Housing_Category)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    labs(x = "State", y = "Percentage", title = title) +
    scale_y_continuous(labels = percent_format(scale = 1)) +
    scale_fill_manual(values = category_colors) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}

# Plot for Owner Occupied
owner_plot <- create_bar_plot(df5_long, "Owner Occupied", "Percentage of Owner Occupied Housing by State")
print(owner_plot)

# Plot for Renter Occupied
renter_plot <- create_bar_plot(df5_long, "Renter Occupied", "Percentage of Renter Occupied Housing by State")
print(renter_plot)

#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
library(tidyr)

#install.packages("factoextra")
library(factoextra)

#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)

DF6 - Disabilities

#Abhay
#Disabilities - K201803
# Get ACS data for the Disabilities dataset
df6 <- get_acs(geography = "state", 
               table = "K201803", 
               year = 2021, 
               survey = "acs1-year", 
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201803 and caching the dataset for faster future access.
# Assuming the variables_df6 mapping is similar to variables_df1 but for the Disabilities table
# You need to create variables_df6 with the correct variable codes and labels for the Disabilities data
variables_df6 <- data.frame(
  variable = c("K201803_001", "K201803_002", "K201803_003", "K201803_004", "K201803_005", "K201803_006", "K201803_007","K201803_008","K201803_009"),
  label = c("Total_people", "Total With Disabilities", 
            "Hearing", "Vision difficulty", 
            "cognative", "ambulatory difficulty", 
            "Self-care difficulty","Independent living difficulty","No Disability")
)

# Join your data with the variable labels to get the descriptive names
df6_labeled <- df6 %>%
  left_join(variables_df6, by = "variable")

# Reshape the data so that state names are rows and variable labels are columns
df6_wide <- df6_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
df6_wide
## # A tibble: 52 × 10
##    NAME          Total_people Total With Disabilit…¹ Hearing `Vision difficulty`
##    <chr>                <dbl>                  <dbl>   <dbl>               <dbl>
##  1 Alabama            4957633                 808071  208028              152798
##  2 Alaska              702154                  92390   33397               15748
##  3 Arizona            7174053                 972252  298849              180792
##  4 Arkansas           2974701                 517051  142133              105624
##  5 California        38724294                4324355 1140131              844049
##  6 Colorado           5715497                 640346  211803              120570
##  7 Connecticut        3557526                 427014  113490               78078
##  8 Delaware            987964                 130551   37933               25335
##  9 District of …       659979                  76754   14429               14569
## 10 Florida           21465883                2906367  812248              555361
## # ℹ 42 more rows
## # ℹ abbreviated name: ¹​`Total With Disabilities`
## # ℹ 5 more variables: cognative <dbl>, `ambulatory difficulty` <dbl>,
## #   `Self-care difficulty` <dbl>, `Independent living difficulty` <dbl>,
## #   `No Disability` <dbl>
# df6_wide now has states as rows and the descriptive variable labels as columns
# Calculate percentages
df6_wide <- df6_wide %>%
  mutate(DisabilityRate = `Total With Disabilities` / `Total_people` * 100)

# Plotting with ggplot2
library(ggplot2)

ggplot(df6_wide, aes(x = reorder(NAME, -DisabilityRate), y = DisabilityRate)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(x = "State", y = "Disability Rate (%)", title = "Disability Rates by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Select only the disability-related columns for correlation
disability_data <- df6_wide %>%
  select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)

# Compute correlation matrix
correlation_matrix <- cor(disability_data, use = "complete.obs")  # use = "complete.obs" handles missing values

# Check if corrplot is installed; if not, install it
if (!require(corrplot)) install.packages("corrplot")

library(corrplot)
corrplot(correlation_matrix, method = "circle")

# Comparative analysis of disabilities within a state (example: Iowa)
iowa_data <- df6_wide %>%
  filter(NAME == "Iowa") %>%
 select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)

# Plotting the data for Iowa
barplot(as.matrix(iowa_data), beside = TRUE, legend.text = TRUE, args.legend = list(x = "topright"))

# Predicting 'Ambulatory difficulty' based on other disabilities
lm_model <- lm(`ambulatory difficulty` ~ `Hearing` + `Vision difficulty` + `cognative` + `Self-care difficulty` + `Independent living difficulty`, data = df6_wide)

# Summary of the linear model
summary(lm_model)
## 
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` + 
##     cognative + `Self-care difficulty` + `Independent living difficulty`, 
##     data = df6_wide)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -56621 -11941  -2766  17524  73868 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -8200.5856  5646.4977  -1.452 0.153197    
## Hearing                             0.6796     0.1793   3.791 0.000436 ***
## `Vision difficulty`                 1.0070     0.1644   6.126 1.87e-07 ***
## cognative                          -0.5426     0.2291  -2.369 0.022116 *  
## `Self-care difficulty`             -1.9464     0.4434  -4.390 6.59e-05 ***
## `Independent living difficulty`     1.9228     0.3144   6.115 1.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24940 on 46 degrees of freedom
## Multiple R-squared:  0.9968, Adjusted R-squared:  0.9965 
## F-statistic:  2909 on 5 and 46 DF,  p-value: < 2.2e-16
lm_model
## 
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` + 
##     cognative + `Self-care difficulty` + `Independent living difficulty`, 
##     data = df6_wide)
## 
## Coefficients:
##                     (Intercept)                          Hearing  
##                      -8200.5856                           0.6796  
##             `Vision difficulty`                        cognative  
##                          1.0070                          -0.5426  
##          `Self-care difficulty`  `Independent living difficulty`  
##                         -1.9464                           1.9228
# K-means clustering
#Perform a clustering analysis to identify groups of states with similar disability profiles.


# K-means clustering with 3 centers
set.seed(123)  # For reproducibility
clusters <- kmeans(disability_data, centers = 3)

# Add cluster assignment to the data
df6_wide$Cluster <- as.factor(clusters$cluster)

pca_res <- prcomp(disability_data, scale. = TRUE)
fviz_pca_biplot(pca_res, label = "none", col.ind = df6_wide$Cluster, addEllipses = TRUE)